Set pipeline:

Load preprocessed dataset (preprocessing code in /../Gandal/data_preprocessing.Rmd)

Keep DE genes

datExpr = datExpr %>% filter(rownames(.) %in% rownames(DE_info)[DE_info$padj<0.05])
rownames(datExpr) = datGenes$feature_id[DE_info$padj<0.05 & !is.na(DE_info$padj)]
datGenes = datGenes %>% filter(feature_id %in% rownames(DE_info)[DE_info$padj<0.05])
DE_info = DE_info %>% filter(padj<0.05)

print(paste0('Keeping ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "Keeping 3000 genes and 80 samples"




Define a gene co-expression similarity

Using Biweight midcorrelation because it’s more robust to outliers than regular correlation or Mutual Information score

allowWGCNAThreads()
## Allowing multi-threading with up to 4 threads.
# MAD = median absolute deviation
cor_mat = datExpr %>% t %>% bicor

Correcting the correlation matrix from \(s \in [-1,1]\) to \(s \in [0,1]\). Two methods are proposed: \(s_{ij}=|bw(i,j)|\) and \(s_{ij}=\frac{1+bw(i,j)}{2}\)

-Using \(s_{ij}=\frac{1+bw(i,j)}{2}\), the strongest negative correlations (-1) get mapped to 0 (no correlation) and the zero correlated genes get mapped to the average correlation (0.5), which I don’t think makes much sense

-Using \(s_{ij}=|bw(i,j)|\) we lose the direction of the correlation, but at least we maintain the magnitude of the correlation of all the genes. Decided to use this one

S = abs(cor_mat)

Clustering

Identifying gene modules

Using 1-S sa distance matrix

diss_S = 1-S
dend = diss_S %>% as.dist %>% hclust(method='average')
plot(dend, hang=0, labels=FALSE)

Using Dynamic Tree Cut, a dynamic branch cutting approach taken from Dynamic Tree Cut: in-depth description, tests and applications

modules = cutreeDynamicTree(dend, deepSplit=F, minModuleSize=20)
names(modules) = labels(dend)

clusterings[['S']] = modules

Merging modules with similar expression profiles

Calculate the “eigengenes” (1st principal component) of each module and merging similar modules

module_colors = c('gray', viridis(max(modules)))

MEs_output = datExpr %>% t %>% moduleEigengenes(colors=modules)
MEs = MEs_output$eigengenes

# Merge similar modules
cor_dist = 1-cor(MEs)
dend_MEs = cor_dist %>% as.dist %>% hclust(method='average')
dend_MEs %>% as.dendrogram %>% plot(ylim=c(0, 0.5))
abline(h=0.45, col='#0099cc')
abline(h=0.2, col='#009999')

# Two main modules
tree_cut = cutree(dend_MEs, h=0.45)
top_modules = modules %>% replace(modules %in% (gsub('ME', '', names(tree_cut[tree_cut==1])) %>% as.numeric), 1) %>%
                          replace(modules %in% (gsub('ME', '', names(tree_cut[tree_cut==2])) %>% as.numeric), 2) %>%
                          replace(modules == 0, 0)
clusterings[['S_top_clusters']] = top_modules

# Closest modules
tree_cut = cutree(dend_MEs, h=0.2)
merged_modules = modules
n=0
for(i in sort(unique(tree_cut))){
  n=n+1
  merged_modules = merged_modules %>% 
                   replace(modules %in% (gsub('ME', '', names(tree_cut[tree_cut==i])) %>% as.numeric), n)
}
merged_modules = merged_modules %>% replace(modules == 0, 0)

clusterings[['S_merged']] = merged_modules
top_module_colors = c('gray', viridis(max(top_modules)))
merged_module_colors = c('gray', viridis(length(unique(merged_modules))))

dend_colors = data.frame('ID'=names(modules[labels(dend)]),
                         'OriginalModules' = module_colors[modules[dend$order]+1],
                         'MergedModules' = merged_module_colors[merged_modules[dend$order]+1],
                         'TopModules' = top_module_colors[top_modules[dend$order]+1])

dend %>% as.dendrogram(hang=0) %>% set('labels', rep('', nrow(diss_S))) %>% plot(ylim=c(min(dend$height),1))
colored_bars(colors=dend_colors[,-1])

rm(MEs, modules, module_colors, MEs_output, top_modules, merged_modules, tree_cut, merged_module_colors, 
   top_module_colors, dend_colors, i, diss_S)




Define a family of adjacency functions

Chose power adjacency function over the sigmoid function because it has only one parameter to adjust and both methods are supposed to lead to very similar results if the parameters are chosen with the scale-free topology criterion.

Choosing a parameter value

Following the scale-free topology criterion because metabolic networks have been found to display approximate scale free topology

  1. Only consider those parameter values that lead to a network satisfying scale-free topology at least approximately, e.g. signed \(R^2 > 0.80\)
best_power = datExpr %>% t %>% pickSoftThreshold(powerVector = 1:15, RsquaredCut=0.8)
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k.  max.k.
## 1      1  0.00342  0.261          0.988 775.000  770.0000 1330.00
## 2      2  0.24500 -1.490          0.986 257.000  247.0000  636.00
## 3      3  0.58100 -2.220          0.963  98.200   90.3000  331.00
## 4      4  0.74900 -2.510          0.990  41.600   36.6000  186.00
## 5      5  0.83500 -2.560          0.989  19.200   15.8000  110.00
## 6      6  0.87300 -2.590          0.994   9.460    7.2500   67.80
## 7      7  0.90000 -2.520          0.994   4.960    3.4900   43.50
## 8      8  0.90800 -2.440          0.988   2.740    1.7600   28.80
## 9      9  0.88700 -2.380          0.951   1.590    0.9140   19.70
## 10    10  0.39100 -3.380          0.358   0.960    0.4870   13.70
## 11    11  0.38500 -3.210          0.349   0.603    0.2660    9.78
## 12    12  0.93500 -1.980          0.982   0.391    0.1490    7.11
## 13    13  0.95700 -1.830          0.985   0.262    0.0850    5.25
## 14    14  0.93300 -1.770          0.941   0.180    0.0495    3.98
## 15    15  0.94200 -1.780          0.964   0.126    0.0295    3.36
print(paste0('Best power for scale free topology: ', best_power$powerEstimate))
## [1] "Best power for scale free topology: 5"

Elevate the matrix to the suggested power

S_sft = S^best_power$powerEstimate

Clustering

Identifying gene modules

Using 1-S_sft sa distance matrix

diss_S_sft = 1-S_sft
dend = diss_S_sft %>% as.dist %>% hclust(method='average')
plot(dend, hang=0, labels=FALSE)

Using Dynamic Tree Cut, a dynamic branch cutting approach taken from Dynamic Tree Cut: in-depth description, tests and applications

modules = cutreeDynamicTree(dend, deepSplit=F, minModuleSize=20)
names(modules) = labels(dend)

clusterings[['S_sft']] = modules

Merging modules with similar expression profiles

Calculate the “eigengenes” (1st principal component) of each module and merging similar modules

module_colors = c('gray', viridis(max(modules)))

MEs_output = datExpr %>% t %>% moduleEigengenes(colors=modules)
MEs = MEs_output$eigengenes

# Merge similar modules
cor_dist = 1-cor(MEs)
dend_MEs = cor_dist %>% as.dist %>% hclust(method='average')
dend_MEs %>% as.dendrogram %>% plot(ylim=c(0, 0.6))
abline(h=0.55, col='#0099cc')
abline(h=0.23, col='#009999')

# Two main modules
tree_cut = cutree(dend_MEs, h=0.55)
top_modules = modules %>% replace(modules %in% (gsub('ME', '', names(tree_cut[tree_cut==1])) %>% as.numeric), 1) %>%
                          replace(modules %in% (gsub('ME', '', names(tree_cut[tree_cut==2])) %>% as.numeric), 2) %>%
                          replace(modules == 0, 0)
clusterings[['S_sft_top_clusters']] = top_modules

# Closest modules
tree_cut = cutree(dend_MEs, h=0.23)
merged_modules = modules
n=0
for(i in sort(unique(tree_cut))){
  n=n+1
  merged_modules = merged_modules %>% 
                   replace(modules %in% (gsub('ME', '', names(tree_cut[tree_cut==i])) %>% as.numeric), n)
}
merged_modules = merged_modules %>% replace(modules == 0, 0)

clusterings[['S_sft_merged']] = merged_modules
merged_module_colors = c('gray', viridis(length(unique(merged_modules))))
top_module_colors = c('gray', viridis(max(top_modules)))

dend_colors = data.frame('ID'=names(modules[labels(dend)]),
                         'OriginalModules' = module_colors[modules[dend$order]+1],
                         'MergedModules' = merged_module_colors[merged_modules[dend$order]+1],
                         'TopModules' = top_module_colors[top_modules[dend$order]+1])

dend %>% as.dendrogram(hang=0) %>% set('labels', rep('', nrow(S_sft))) %>% plot(ylim=c(min(dend$height),1))
colored_bars(colors=dend_colors[,-1])

rm(MEs, modules, module_colors, MEs_output, top_modules, merged_modules, tree_cut, merged_module_colors, 
   top_module_colors, dend_colors, i, diss_S_sft)




Defining a measure of node dissimilarity

Using topological overlap dissimilarity measure because it has been found to result in biologically meaningful modules

Create Topological Overlap Matrix (TOM)

1st quartile is already 0.9852, most of the genes are very dissimilar

TOM = S_sft %>% TOMsimilarity
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
dissTOM = 1-TOM

rownames(dissTOM) = rownames(S_sft)
colnames(dissTOM) = colnames(S_sft)

dissTOM %>% melt %>% summary
##               Var1                      Var2             value       
##  ENSG00000000971:   3000   ENSG00000000971:   3000   Min.   :0.0000  
##  ENSG00000001084:   3000   ENSG00000001084:   3000   1st Qu.:0.9852  
##  ENSG00000001626:   3000   ENSG00000001626:   3000   Median :0.9918  
##  ENSG00000002586:   3000   ENSG00000002586:   3000   Mean   :0.9882  
##  ENSG00000002746:   3000   ENSG00000002746:   3000   3rd Qu.:0.9956  
##  ENSG00000002933:   3000   ENSG00000002933:   3000   Max.   :0.9999  
##  (Other)        :8982000   (Other)        :8982000




Identifying gene modules

Using hierarchical clustering using average linkage on the TOM-based dissimilarity matrix

dend = dissTOM %>% as.dist %>% hclust(method='average')
plot(dend, hang=0, labels=FALSE)

Instead of using a fixed height barch to cut the dendrogram into clusters, using a dynamic branch cutting approach taken from Dynamic Tree Cut: in-depth description, tests and applications

Two available methods:

  1. Dynamic Tree Cut: top-down algorithm relying only on the dendrogram and respecting the order of the clustered objects on it. This method is less sensitive to parameter choice but also less flexible (I think I prefer robustness over flexibility, so I’m going to use this one)

  2. Dynamic Hybrid Cut: builds the clusters from bottom up. In addition to information from the dendrogram, it utilizes dissimilarity information among the objects. Seems to me that relies on too many heuristics and has too many parameters to tune

Plus a post processing step:

modules = cutreeDynamicTree(dend, deepSplit=F, minModuleSize=20)
names(modules) = labels(dend)

clusterings[['TOM']] = modules

Merging modules with similar expression profiles

“One can relate modules to each other by correlating the corresponding module eigengenes (Horvath et al., 2005). If two modules are highly correlated, one may want to merge them”

Calculate the “eigengenes” (1st principal component) of each module

module_colors = c('gray', viridis(max(modules)))

MEs_output = datExpr %>% t %>% moduleEigengenes(colors=modules)
MEs = MEs_output$eigengenes

Merge similar modules

cor_dist = 1-cor(MEs)
dend_MEs = cor_dist %>% as.dist %>% hclust(method='average')
dend_MEs %>% as.dendrogram %>% plot(ylim=c(0, 0.6))
abline(h=0.55, col='#0099cc')
abline(h=0.228, col='#009999')

# Two main modules
tree_cut = cutree(dend_MEs, h=0.55)
top_modules = modules %>% replace(modules %in% (gsub('ME', '', names(tree_cut[tree_cut==1])) %>% as.numeric), 1) %>%
                          replace(modules %in% (gsub('ME', '', names(tree_cut[tree_cut==2])) %>% as.numeric), 2) %>%
                          replace(modules == 0, 0)
clusterings[['TOM_top_clusters']] = top_modules

# Closest modules
tree_cut = cutree(dend_MEs, h=0.228)
merged_modules = modules
n=0
for(i in sort(unique(tree_cut))){
  n=n+1
  merged_modules = merged_modules %>% 
                   replace(modules %in% (gsub('ME', '', names(tree_cut[tree_cut==i])) %>% as.numeric), n)
}
merged_modules = merged_modules %>% replace(modules == 0, 0)

clusterings[['TOM_merged']] = merged_modules
merged_module_colors = c('gray', viridis(length(unique(merged_modules))))
top_module_colors = c('gray', viridis(max(top_modules)))

dend_colors = data.frame('ID'=names(modules[labels(dend)]),
                         'OriginalModules' = module_colors[modules[dend$order]+1],
                         'MergedModules' = merged_module_colors[merged_modules[dend$order]+1],
                         'TopModules' = top_module_colors[top_modules[dend$order]+1])

dend %>% as.dendrogram(hang=0) %>% set('labels', rep('', nrow(dissTOM))) %>% plot(ylim=c(min(dend$height),1))
colored_bars(colors=dend_colors[,-1])

rm(MEs, dend, modules, module_colors, MEs_output, top_modules, merged_modules, tree_cut, top_module_colors, 
   merged_module_colors, cor_dist, dend_colors, i, dend_MEs)



Exploratory analysis of clustering

Adjusted Rand Index comparison

cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in 1:(length(clusterings))){
  cluster1 = sub(0, NA, clusterings[[i]]) %>% as.factor
  for(j in (i):length(clusterings)){
    cluster2 = sub(0, NA, clusterings[[j]]) %>% as.factor
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)

cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = F, Colv = F, dendrogram = 'none', col=rev(brewer.pal(9,'YlOrRd'))[4:9],
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(12,12))

rm(i, j, cluster1, cluster2, cluster_sim)

PCA

Cluster don’t follow any strong patterns, at least in the first principal components

pca = datExpr %>% prcomp

plot_data = data.frame('ID' = rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 
                       'PC3' = pca$x[,3], 'PC4' = pca$x[,4], 'PC5' = pca$x[,5],
                       'S' = sub(0,NA,clusterings[['S']]) %>% as.factor, 
                       'S_top_clusters' = sub(0,NA,clusterings[['S_top_clusters']]) %>% as.factor, 
                       'S_merged' = sub(0,NA,clusterings[['S_merged']]) %>% as.factor, 
                       'S_sft' = sub(0,NA,clusterings[['S_sft']]) %>% as.factor, 
                       'S_sft_top_clusters' = sub(0,NA,clusterings[['S_sft_top_clusters']]) %>% as.factor, 
                       'S_sft_merged' = sub(0,NA,clusterings[['S_sft_merged']]) %>% as.factor,
                       'TOM' = sub(0,NA,clusterings[['TOM']]) %>% as.factor, 
                       'TOM_top_clusters' = sub(0,NA,clusterings[['TOM_top_clusters']]) %>% as.factor, 
                       'TOM_merged' = sub(0,NA,clusterings[['TOM_merged']]) %>% as.factor)

selectable_scatter_plot(plot_data[,-1,], plot_data[,-1])
rm(pca, plot_data)



Conclusions

  • From the dendrograms, the original similarity matrix leaves almost no gene without cluster, and the other two leave a significant proportion out

  • TOM’s dendrogram is the one with the most defined branches, but the original S dendrogram is the one that separates more clearly into two in the main modules

  • No method seems to have a strong relation with the first principal components

  • We would need to do some type of biological enrichment analysis to see which of these clusterings is better




Save clusterings

clusterings_file = './../Data/Gandal/clusterings.csv'

if(file.exists(clusterings_file)){

  df = read.csv(clusterings_file, row.names=1)
  
  if(!all(rownames(df) == rownames(datExpr))) stop('Gene ordering does not match the one in clusterings.csv!')
  
  for(clustering in names(clusterings)){
    df = df %>% mutate(!!clustering := sub(0, NA, clusterings[[clustering]]))
    rownames(df) = rownames(datExpr)
  }
  
} else {
  
  df = clusterings %>% unlist %>% matrix(nrow=length(clusterings), byrow=T) %>% t %>% data.frame %>% na_if(0)
  colnames(df) = names(clusterings)
  rownames(df) = rownames(datExpr)

}

write.csv(df, file=clusterings_file)

rm(clusterings_file, df, clustering)

Session info

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] pdfCluster_1.0-3            doParallel_1.0.14          
##  [3] iterators_1.0.9             foreach_1.4.4              
##  [5] WGCNA_1.66                  fastcluster_1.1.25         
##  [7] dynamicTreeCut_1.63-1       sva_3.30.1                 
##  [9] genefilter_1.64.0           mgcv_1.8-26                
## [11] nlme_3.1-137                DESeq2_1.22.2              
## [13] SummarizedExperiment_1.12.0 DelayedArray_0.8.0         
## [15] BiocParallel_1.16.6         matrixStats_0.54.0         
## [17] Biobase_2.42.0              GenomicRanges_1.34.0       
## [19] GenomeInfoDb_1.18.2         IRanges_2.16.0             
## [21] S4Vectors_0.20.1            BiocGenerics_0.28.0        
## [23] biomaRt_2.38.0              gplots_3.0.1               
## [25] dendextend_1.10.0           gridExtra_2.3              
## [27] viridis_0.5.1               viridisLite_0.3.0          
## [29] RColorBrewer_1.1-2          plotlyutils_0.0.0.9000     
## [31] plotly_4.8.0                glue_1.3.1                 
## [33] reshape2_1.4.3              forcats_0.3.0              
## [35] stringr_1.4.0               dplyr_0.8.0.1              
## [37] purrr_0.3.1                 readr_1.3.1                
## [39] tidyr_0.8.3                 tibble_2.1.1               
## [41] ggplot2_3.1.0               tidyverse_1.2.1            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.1.0           backports_1.1.2        Hmisc_4.1-1           
##   [4] plyr_1.8.4             lazyeval_0.2.2         splines_3.5.2         
##   [7] robust_0.4-18          digest_0.6.18          htmltools_0.3.6       
##  [10] GO.db_3.7.0            gdata_2.18.0           magrittr_1.5          
##  [13] checkmate_1.8.5        memoise_1.1.0          fit.models_0.5-14     
##  [16] cluster_2.0.7-1        limma_3.38.3           annotate_1.60.1       
##  [19] modelr_0.1.4           prettyunits_1.0.2      colorspace_1.4-1      
##  [22] rrcov_1.4-3            blob_1.1.1             rvest_0.3.2           
##  [25] haven_1.1.1            xfun_0.5               crayon_1.3.4          
##  [28] RCurl_1.95-4.10        jsonlite_1.6           impute_1.56.0         
##  [31] survival_2.43-3        gtable_0.2.0           zlibbioc_1.28.0       
##  [34] XVector_0.22.0         kernlab_0.9-27         prabclus_2.2-7        
##  [37] DEoptimR_1.0-8         abind_1.4-5            scales_1.0.0          
##  [40] mvtnorm_1.0-7          DBI_1.0.0              Rcpp_1.0.1            
##  [43] xtable_1.8-3           progress_1.2.0         htmlTable_1.11.2      
##  [46] magic_1.5-9            foreign_0.8-71         bit_1.1-14            
##  [49] mclust_5.4             preprocessCore_1.44.0  Formula_1.2-3         
##  [52] htmlwidgets_1.3        httr_1.4.0             fpc_2.1-11.1          
##  [55] acepack_1.4.1          modeltools_0.2-22      pkgconfig_2.0.2       
##  [58] XML_3.98-1.11          flexmix_2.3-15         nnet_7.3-12           
##  [61] locfit_1.5-9.1         tidyselect_0.2.5       rlang_0.3.2           
##  [64] AnnotationDbi_1.44.0   munsell_0.5.0          cellranger_1.1.0      
##  [67] tools_3.5.2            cli_1.1.0              generics_0.0.2        
##  [70] RSQLite_2.1.1          broom_0.5.1            geometry_0.4.0        
##  [73] evaluate_0.13          yaml_2.2.0             knitr_1.22            
##  [76] bit64_0.9-7            robustbase_0.93-0      caTools_1.17.1        
##  [79] whisker_0.3-2          xml2_1.2.0             compiler_3.5.2        
##  [82] rstudioapi_0.7         geneplotter_1.60.0     pcaPP_1.9-73          
##  [85] stringi_1.4.3          lattice_0.20-38        trimcluster_0.1-2.1   
##  [88] Matrix_1.2-15          pillar_1.3.1           data.table_1.12.0     
##  [91] bitops_1.0-6           R6_2.4.0               latticeExtra_0.6-28   
##  [94] KernSmooth_2.23-15     codetools_0.2-15       MASS_7.3-51.1         
##  [97] gtools_3.5.0           assertthat_0.2.1       withr_2.1.2           
## [100] GenomeInfoDbData_1.2.0 diptest_0.75-7         hms_0.4.2             
## [103] grid_3.5.2             rpart_4.1-13           class_7.3-14          
## [106] rmarkdown_1.12         lubridate_1.7.4        base64enc_0.1-3